import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#.... 1A... import dataframe
signal_df = pd.read_csv("signal-data.csv")
signal_df
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | ... | NaN | 0.5005 | 0.0118 | 0.0035 | 2.3630 | NaN | NaN | NaN | NaN | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | ... | 208.2045 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | ... | 82.8602 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | ... | 73.8432 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | ... | NaN | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | 2008-10-16 15:13:00 | 2899.41 | 2464.36 | 2179.7333 | 3085.3781 | 1.4843 | 100.0 | 82.2467 | 0.1248 | 1.3424 | ... | 203.1720 | 0.4988 | 0.0143 | 0.0039 | 2.8669 | 0.0068 | 0.0138 | 0.0047 | 203.1720 | -1 |
| 1563 | 2008-10-16 20:49:00 | 3052.31 | 2522.55 | 2198.5667 | 1124.6595 | 0.8763 | 100.0 | 98.4689 | 0.1205 | 1.4333 | ... | NaN | 0.4975 | 0.0131 | 0.0036 | 2.6238 | 0.0068 | 0.0138 | 0.0047 | 203.1720 | -1 |
| 1564 | 2008-10-17 05:26:00 | 2978.81 | 2379.78 | 2206.3000 | 1110.4967 | 0.8236 | 100.0 | 99.4122 | 0.1208 | NaN | ... | 43.5231 | 0.4987 | 0.0153 | 0.0041 | 3.0590 | 0.0197 | 0.0086 | 0.0025 | 43.5231 | -1 |
| 1565 | 2008-10-17 06:01:00 | 2894.92 | 2532.01 | 2177.0333 | 1183.7287 | 1.5726 | 100.0 | 98.7978 | 0.1213 | 1.4622 | ... | 93.4941 | 0.5004 | 0.0178 | 0.0038 | 3.5662 | 0.0262 | 0.0245 | 0.0075 | 93.4941 | -1 |
| 1566 | 2008-10-17 06:07:00 | 2944.92 | 2450.76 | 2195.4444 | 2914.1792 | 1.5978 | 100.0 | 85.1011 | 0.1235 | NaN | ... | 137.7844 | 0.4987 | 0.0181 | 0.0040 | 3.6275 | 0.0117 | 0.0162 | 0.0045 | 137.7844 | -1 |
1567 rows × 592 columns
signal_df.shape
(1567, 592)
signal_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 592 entries, Time to Pass/Fail dtypes: float64(590), int64(1), object(1) memory usage: 7.1+ MB
signal_df.describe()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1561.000000 | 1560.000000 | 1553.000000 | 1553.000000 | 1553.000000 | 1553.0 | 1553.000000 | 1558.000000 | 1565.000000 | 1565.000000 | ... | 618.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1567.000000 |
| mean | 3014.452896 | 2495.850231 | 2200.547318 | 1396.376627 | 4.197013 | 100.0 | 101.112908 | 0.121822 | 1.462862 | -0.000841 | ... | 97.934373 | 0.500096 | 0.015318 | 0.003847 | 3.067826 | 0.021458 | 0.016475 | 0.005283 | 99.670066 | -0.867262 |
| std | 73.621787 | 80.407705 | 29.513152 | 441.691640 | 56.355540 | 0.0 | 6.237214 | 0.008961 | 0.073897 | 0.015116 | ... | 87.520966 | 0.003404 | 0.017180 | 0.003720 | 3.578033 | 0.012358 | 0.008808 | 0.002867 | 93.891919 | 0.498010 |
| min | 2743.240000 | 2158.750000 | 2060.660000 | 0.000000 | 0.681500 | 100.0 | 82.131100 | 0.000000 | 1.191000 | -0.053400 | ... | 0.000000 | 0.477800 | 0.006000 | 0.001700 | 1.197500 | -0.016900 | 0.003200 | 0.001000 | 0.000000 | -1.000000 |
| 25% | 2966.260000 | 2452.247500 | 2181.044400 | 1081.875800 | 1.017700 | 100.0 | 97.920000 | 0.121100 | 1.411200 | -0.010800 | ... | 46.184900 | 0.497900 | 0.011600 | 0.003100 | 2.306500 | 0.013425 | 0.010600 | 0.003300 | 44.368600 | -1.000000 |
| 50% | 3011.490000 | 2499.405000 | 2201.066700 | 1285.214400 | 1.316800 | 100.0 | 101.512200 | 0.122400 | 1.461600 | -0.001300 | ... | 72.288900 | 0.500200 | 0.013800 | 0.003600 | 2.757650 | 0.020500 | 0.014800 | 0.004600 | 71.900500 | -1.000000 |
| 75% | 3056.650000 | 2538.822500 | 2218.055500 | 1591.223500 | 1.525700 | 100.0 | 104.586700 | 0.123800 | 1.516900 | 0.008400 | ... | 116.539150 | 0.502375 | 0.016500 | 0.004100 | 3.295175 | 0.027600 | 0.020300 | 0.006400 | 114.749700 | -1.000000 |
| max | 3356.350000 | 2846.440000 | 2315.266700 | 3715.041700 | 1114.536600 | 100.0 | 129.252200 | 0.128600 | 1.656400 | 0.074900 | ... | 737.304800 | 0.509800 | 0.476600 | 0.104500 | 99.303200 | 0.102800 | 0.079900 | 0.028600 | 737.304800 | 1.000000 |
8 rows × 591 columns
signal_df.describe(include=object)
| Time | |
|---|---|
| count | 1567 |
| unique | 1534 |
| top | 2008-10-15 01:52:00 |
| freq | 3 |
signal_df.value_counts().isna().sum()
0
signal_df.value_counts().isnull().sum()
0
signal_df["Time"].unique()
array(['2008-07-19 11:55:00', '2008-07-19 12:32:00',
'2008-07-19 13:17:00', ..., '2008-10-17 05:26:00',
'2008-10-17 06:01:00', '2008-10-17 06:07:00'], dtype=object)
signal_df["Pass/Fail"].unique()
array([-1, 1], dtype=int64)
signal_df["Pass/Fail"].value_counts().sum()
1567
sns.histplot(signal_df["Pass/Fail"])
<AxesSubplot:xlabel='Pass/Fail', ylabel='Count'>
signal_df.groupby('Pass/Fail').size().plot(kind='pie', subplots=True, shadow=True, startangle=30, figsize=(8,6), autopct='%1.2f%%')
font1 = {'family':'serif','color':'blue','size':20}
plt.title("Pass Fail categories", fontdict = font1)
plt.tight_layout()
plt.show()
#......... 1B... 5 point summary observations
1. There are 592 columns (huge set of features)
2. Time is an object column, Pass/Fail is an int64 column having 1/-1 and remaining all are floats
3. There are no specific column names and instead has only numbers as colum names other than Time & Pass/Fail columns
4. There are lots of columns with 0 values & blank (missing) values in the dataset
5. Data needs to be massaged
6. Target variable is Pass/Fail column and has 93.36% -1 and 6.64% 1, hugely imbalanced.
orig_signal_df = signal_df.copy() #......... saving a copy of this dataset before modifying it
orig_signal_df.shape
(1567, 592)
signal_df['Time'].isnull().sum()
0
#.....2A..... to remove columns with more than 20% missing values.
drop_cols = 0
mean_cols = 0
for i in signal_df.columns:
percent_missing = signal_df[i].isnull().sum() * 100 / len(signal_df[i])
if (i == 'Time' or i == 'Pass/Fail'):
print("Skipping")
else:
if ( percent_missing > 0.20):
signal_df.drop(i,axis=1,inplace=True)
drop_cols = drop_cols+1
else:
mean = signal_df[i].mean()
signal_df[i].fillna(mean, inplace=True) #..... Imputing with mean
mean_cols = mean_cols+1
Skipping Skipping
signal_df.shape #..... this has reduced the column / feature count to 254 from 592, dropping 338 columns
(1567, 254)
print("dropped columns",drop_cols) #..... 338 columns dropped.
dropped columns 338
print("mean imputed columns",mean_cols) #.... 252 columns have got imputed with column means
mean imputed columns 252
signal_df.describe()
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | ... | 577 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.0 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | ... | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 |
| mean | 1.462862 | -0.000841 | 0.000146 | 0.964353 | 199.956809 | 0.0 | 9.005371 | 413.086035 | 9.907603 | 0.971444 | ... | 16.642363 | 0.500096 | 0.015318 | 0.003847 | 3.067826 | 0.021458 | 0.016475 | 0.005283 | 99.670066 | -0.867262 |
| std | 0.073849 | 0.015107 | 0.009296 | 0.012444 | 3.255195 | 0.0 | 2.793916 | 17.204591 | 2.401563 | 0.012051 | ... | 12.485267 | 0.003403 | 0.017174 | 0.003719 | 3.576891 | 0.012354 | 0.008805 | 0.002866 | 93.861936 | 0.498010 |
| min | 1.191000 | -0.053400 | -0.034900 | 0.655400 | 182.094000 | 0.0 | 2.249300 | 333.448600 | 4.469600 | 0.579400 | ... | 4.582000 | 0.477800 | 0.006000 | 0.001700 | 1.197500 | -0.016900 | 0.003200 | 0.001000 | 0.000000 | -1.000000 |
| 25% | 1.411250 | -0.010800 | -0.005600 | 0.958100 | 198.130950 | 0.0 | 7.096750 | 406.131000 | 9.568550 | 0.968250 | ... | 11.501550 | 0.497900 | 0.011600 | 0.003100 | 2.306500 | 0.013450 | 0.010600 | 0.003300 | 44.368600 | -1.000000 |
| 50% | 1.461600 | -0.001300 | 0.000400 | 0.965800 | 199.537100 | 0.0 | 8.973900 | 412.262900 | 9.852000 | 0.972500 | ... | 13.817900 | 0.500200 | 0.013800 | 0.003600 | 2.757700 | 0.020500 | 0.014800 | 0.004600 | 72.023000 | -1.000000 |
| 75% | 1.516850 | 0.008400 | 0.005900 | 0.971300 | 202.006750 | 0.0 | 10.858700 | 419.082800 | 10.127750 | 0.976800 | ... | 17.080900 | 0.502350 | 0.016500 | 0.004100 | 3.294950 | 0.027600 | 0.020300 | 0.006400 | 114.749700 | -1.000000 |
| max | 1.656400 | 0.074900 | 0.053000 | 0.984800 | 272.045100 | 0.0 | 19.546500 | 824.927100 | 102.867700 | 0.984800 | ... | 96.960100 | 0.509800 | 0.476600 | 0.104500 | 99.303200 | 0.102800 | 0.079900 | 0.028600 | 737.304800 | 1.000000 |
8 rows × 253 columns
before_std_drop_signal_df = signal_df.copy() #......... making a copy before dropping columns where Std dev is 0
before_std_drop_signal_df
| Time | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | ... | 577 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 1.500500 | 0.016200 | -0.003400 | 0.945500 | 202.439600 | 0.0 | 7.955800 | 414.871000 | 10.043300 | ... | 14.9509 | 0.5005 | 0.0118 | 0.0035 | 2.3630 | 0.021458 | 0.016475 | 0.005283 | 99.670066 | -1 |
| 1 | 2008-07-19 12:32:00 | 1.496600 | -0.000500 | -0.014800 | 0.962700 | 200.547000 | 0.0 | 10.154800 | 414.734700 | 9.259900 | ... | 10.9003 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.009600 | 0.020100 | 0.006000 | 208.204500 | -1 |
| 2 | 2008-07-19 13:17:00 | 1.443600 | 0.004100 | 0.001300 | 0.961500 | 202.017900 | 0.0 | 9.515700 | 416.707500 | 9.314400 | ... | 9.2721 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.058400 | 0.048400 | 0.014800 | 82.860200 | 1 |
| 3 | 2008-07-19 14:43:00 | 1.488200 | -0.012400 | -0.003300 | 0.962900 | 201.848200 | 0.0 | 9.605200 | 422.289400 | 9.692400 | ... | 8.5831 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.020200 | 0.014900 | 0.004400 | 73.843200 | -1 |
| 4 | 2008-07-19 15:22:00 | 1.503100 | -0.003100 | -0.007200 | 0.956900 | 201.942400 | 0.0 | 10.566100 | 420.592500 | 10.338700 | ... | 10.9698 | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.020200 | 0.014900 | 0.004400 | 73.843200 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | 2008-10-16 15:13:00 | 1.342400 | -0.004500 | -0.005700 | 0.957900 | 203.986700 | 0.0 | 11.769200 | 419.340400 | 10.239700 | ... | 11.7256 | 0.4988 | 0.0143 | 0.0039 | 2.8669 | 0.006800 | 0.013800 | 0.004700 | 203.172000 | -1 |
| 1563 | 2008-10-16 20:49:00 | 1.433300 | -0.006100 | -0.009300 | 0.961800 | 204.017300 | 0.0 | 9.162000 | 405.817800 | 10.228500 | ... | 17.8379 | 0.4975 | 0.0131 | 0.0036 | 2.6238 | 0.006800 | 0.013800 | 0.004700 | 203.172000 | -1 |
| 1564 | 2008-10-17 05:26:00 | 1.462862 | -0.000841 | 0.000146 | 0.964353 | 199.956809 | 0.0 | 9.005371 | 413.086035 | 9.907603 | ... | 17.7267 | 0.4987 | 0.0153 | 0.0041 | 3.0590 | 0.019700 | 0.008600 | 0.002500 | 43.523100 | -1 |
| 1565 | 2008-10-17 06:01:00 | 1.462200 | -0.007200 | 0.003200 | 0.969400 | 197.244800 | 0.0 | 9.735400 | 401.915300 | 9.863000 | ... | 19.2104 | 0.5004 | 0.0178 | 0.0038 | 3.5662 | 0.026200 | 0.024500 | 0.007500 | 93.494100 | -1 |
| 1566 | 2008-10-17 06:07:00 | 1.462862 | -0.000841 | 0.000146 | 0.964353 | 199.956809 | 0.0 | 9.005371 | 413.086035 | 9.907603 | ... | 22.9183 | 0.4987 | 0.0181 | 0.0040 | 3.6275 | 0.011700 | 0.016200 | 0.004500 | 137.784400 | -1 |
1567 rows × 254 columns
#....... 2B......... drop all the columns which have the same data in the
stddev_drop_cols = 0
for i in signal_df.columns:
if (i == 'Time' or i == 'Pass/Fail'):
print("Skipping")
else:
std_dev = signal_df[i].std()
if ( std_dev == 0):
signal_df.drop(i,axis=1,inplace=True)
stddev_drop_cols = stddev_drop_cols+1
Skipping Skipping
stddev_drop_cols #........ found 16 columns with absolutely no data variance at all.
16
signal_df.shape #............ features now reduced to 238 from 254 levels
(1567, 238)
signal_df.drop('Time',axis=1,inplace=True) #........ dropping Time as it does not add any value to the dataset
signal_df.shape
(1567, 237)
before_var_drop_signal_df = signal_df.copy()
#......... 2C...... Dropping all the features in which variance is less than or equal to 2%
var_drop_cols = 0
#variable = [ ]
variable_df = pd.DataFrame(columns = ['column', 'variance'])
ind = 0
for i in signal_df.columns:
if (i == 'Pass/Fail'):
print("Skipping")
else:
var = signal_df[i].var() * 100
print("column is",i)
print("variance of column is",var)
tempResultsDf = pd.DataFrame({'column':i,'variance': var}, index=[ind])
ind = ind+1
variable_df = pd.concat([variable_df, tempResultsDf])
variable_df = variable_df[['column','variance']]
#variable.append(i,var)
if ( var <= 0.02):
signal_df.drop(i,axis=1,inplace=True) #..... dropping columns with variance less than 2%
var_drop_cols = var_drop_cols+1
column is 8 variance of column is 0.5453727726488192 column is 9 variance of column is 0.02282073603858347 column is 10 variance of column is 0.008641864894176945 column is 11 variance of column is 0.015485556858808786 column is 12 variance of column is 1059.629456076349 column is 14 variance of column is 780.5964129675081 column is 15 variance of column is 29599.7963687687 column is 16 variance of column is 576.750472972389 column is 17 variance of column is 0.014522306021417356 column is 18 variance of column is 771.9371004655143 column is 20 variance of column is 0.028012491013976855 column is 21 variance of column is 39240424.754859574 column is 22 variance of column is 8720786.490080751 column is 23 variance of column is 190241479.8748207 column is 24 variance of column is 841484924.132007 column is 25 variance of column is 3.1501645535766016 column is 26 variance of column is 3.5862513249058647 column is 27 variance of column is 154.61790398174716 column is 28 variance of column is 1196.4473648661553 column is 29 variance of column is 16.68176293676722 column is 30 variance of column is 0.10839172976713637 column is 31 variance of column is 28.620347663085827 column is 32 variance of column is 410.42791408600215 column is 33 variance of column is 180.64086675217698 column is 34 variance of column is 139.76926554089394 column is 35 variance of column is 662.5097036952035 column is 36 variance of column is 139.7694334021085 column is 37 variance of column is 9.244255669573242 column is 38 variance of column is 19.946309788628184 column is 39 variance of column is 326.3961632327334 column is 43 variance of column is 3884.673913500641 column is 44 variance of column is 3.0618595135600186 column is 45 variance of column is 6157.134289166374 column is 46 variance of column is 14802.19773017837 column is 47 variance of column is 3.5939191872289333 column is 48 variance of column is 2045.5775501726775 column is 50 variance of column is 7467.077019617713 column is 51 variance of column is 370949.8529256382 column is 83 variance of column is 26.63453038207705 column is 86 variance of column is 0.139366786117081 column is 87 variance of column is 0.016506704640163735 column is 88 variance of column is 286623.84441299736 column is 92 variance of column is 0.0010426184846518926 column is 93 variance of column is 0.0009047529751631118 column is 103 variance of column is 0.0009379465617209185 column is 104 variance of column is 7.227809155415178e-05 column is 113 variance of column is 0.01472065857839001 column is 114 variance of column is 0.00027825381817353154 column is 115 variance of column is 239602.90622552662 column is 116 variance of column is 0.009018359277923212 column is 117 variance of column is 4205.748769532688 column is 119 variance of column is 0.00800767608750401 column is 120 variance of column is 1.545158159043361 column is 144 variance of column is 0.37581315369982743 column is 145 variance of column is 0.07035029456624192 column is 146 variance of column is 0.047657221743192923 column is 147 variance of column is 0.07347127145369521 column is 148 variance of column is 35076.27072166416 column is 150 variance of column is 1048.941041682549 column is 151 variance of column is 95931.62615148324 column is 152 variance of column is 54483.368515134556 column is 153 variance of column is 0.008717435767410396 column is 154 variance of column is 2739.6830160195586 column is 156 variance of column is 0.6268599505444745 column is 159 variance of column is 96513938.84135322 column is 160 variance of column is 32998294.016051978 column is 161 variance of column is 1794824693.8066478 column is 162 variance of column is 4289441859.807089 column is 163 variance of column is 1.4862308153697517 column is 164 variance of column is 5.874743858510853 column is 165 variance of column is 16.57052790406349 column is 166 variance of column is 125.22519432509513 column is 167 variance of column is 39.98826092810879 column is 168 variance of column is 0.2266540602826029 column is 169 variance of column is 3.9121354975334497 column is 170 variance of column is 2.4780327924155405 column is 171 variance of column is 0.3692440857673191 column is 172 variance of column is 0.5072324773107983 column is 173 variance of column is 0.9159139960805055 column is 174 variance of column is 0.5072877308155916 column is 175 variance of column is 1.3522199457945736 column is 176 variance of column is 0.5609091396224696 column is 177 variance of column is 7.998297811084513 column is 180 variance of column is 1095.9906124396284 column is 181 variance of column is 5.032399057559343 column is 182 variance of column is 1732.8244852256364 column is 183 variance of column is 4670.243397305576 column is 184 variance of column is 1.2135780936160965 column is 185 variance of column is 5164.469993023038 column is 187 variance of column is 7408.324217091209 column is 188 variance of column is 47110.45349087087 column is 218 variance of column is 136.9045847882574 column is 221 variance of column is 0.054313359348015185 column is 222 variance of column is 0.3128921913532696 column is 223 variance of column is 304218.46312488493 column is 227 variance of column is 0.011553761055006783 column is 228 variance of column is 0.011530495758510527 column is 238 variance of column is 0.0002879344537883691 column is 239 variance of column is 0.00020749955973379974 column is 248 variance of column is 0.24240922937240925 column is 249 variance of column is 0.024871774872223668 column is 250 variance of column is 298086.2326808881 column is 251 variance of column is 0.140418766188983 column is 252 variance of column is 414153.45827378845 column is 254 variance of column is 0.008339595473694745 column is 255 variance of column is 1.4480312089365501 column is 279 variance of column is 0.04111244691303621 column is 280 variance of column is 0.0041975752757274064 column is 281 variance of column is 0.0030713087314702646 column is 282 variance of column is 0.007301540003835524 column is 283 variance of column is 3434.6372527173244 column is 285 variance of column is 92.54439185797739 column is 286 variance of column is 9514.973362280487 column is 287 variance of column is 5445.354190956492 column is 288 variance of column is 0.0008604140919736193 column is 289 variance of column is 260.96583078062713 column is 291 variance of column is 0.06527422682057518 column is 294 variance of column is 22728612.7962463 column is 295 variance of column is 8028699.008040741 column is 296 variance of column is 389608270.2152596 column is 297 variance of column is 1039974152.2576892 column is 298 variance of column is 0.4119565637733134 column is 299 variance of column is 1.7093422100155475 column is 300 variance of column is 4.7964145135299505 column is 301 variance of column is 11.007108457645899 column is 302 variance of column is 3.8961954597521595 column is 303 variance of column is 0.021028896104521397 column is 304 variance of column is 0.42024064844641906 column is 305 variance of column is 0.32912208482169913 column is 306 variance of column is 0.06447458768792125 column is 307 variance of column is 0.07539898167313372 column is 308 variance of column is 0.11277835073088938 column is 309 variance of column is 0.07541248160258952 column is 310 variance of column is 0.18875521008899235 column is 311 variance of column is 0.08286900922215198 column is 312 variance of column is 1.3754196203014588 column is 316 variance of column is 103.69428274281066 column is 317 variance of column is 0.523730723638819 column is 318 variance of column is 147.75415664604154 column is 319 variance of column is 474.5265116865604 column is 320 variance of column is 0.10159907868066428 column is 321 variance of column is 447.8803819536889 column is 323 variance of column is 634.0600851843138 column is 324 variance of column is 4374.151543388048 column is 356 variance of column is 14.960963611645298 column is 359 variance of column is 0.005092759846482496 column is 360 variance of column is 0.04001027561593251 column is 361 variance of column is 29091.750704399703 column is 365 variance of column is 0.0007045380983274795 column is 366 variance of column is 0.0005668814521032019 column is 376 variance of column is 2.8516533852349408e-05 column is 377 variance of column is 2.1738164428612806e-05 column is 386 variance of column is 0.02398822808548924 column is 387 variance of column is 0.002489345869184108 column is 388 variance of column is 29676.963993352932 column is 389 variance of column is 0.013961795973955034 column is 390 variance of column is 41316.31519193336 column is 392 variance of column is 0.0008735296297111316 column is 393 variance of column is 0.14752123041889648 column is 417 variance of column is 1641.809832242191 column is 418 variance of column is 8266815.544597451 column is 419 variance of column is 10578138.507837297 column is 420 variance of column is 933.7541463429861 column is 421 variance of column is 4774.0345053981155 column is 423 variance of column is 106052.44772774345 column is 424 variance of column is 3993.359560451612 column is 425 variance of column is 53988.508698736 column is 426 variance of column is 98.93603564492076 column is 427 variance of column is 923.691392958423 column is 429 variance of column is 4141.42508045685 column is 430 variance of column is 129866.89902059056 column is 431 variance of column is 132293.3958018753 column is 432 variance of column is 1590325.5148778968 column is 433 variance of column is 5091099.468748144 column is 434 variance of column is 116192.8084192825 column is 435 variance of column is 117977.37214485141 column is 436 variance of column is 119271.65968103397 column is 437 variance of column is 259.2887133908518 column is 438 variance of column is 116187.33976807453 column is 439 variance of column is 147085.0137103049 column is 440 variance of column is 3801.3758641201293 column is 441 variance of column is 3.3912653494843363 column is 442 variance of column is 43.42599828204926 column is 443 variance of column is 2.059403123233747 column is 444 variance of column is 2.41715563815205 column is 445 variance of column is 1.993943685861267 column is 446 variance of column is 3.101189849555281 column is 447 variance of column is 0.7470690022969751 column is 448 variance of column is 5.579038163561896 column is 452 variance of column is 84.43824402936274 column is 453 variance of column is 506.28855049213934 column is 454 variance of column is 935.5543786234824 column is 455 variance of column is 87.99799501574967 column is 456 variance of column is 6598.769413350904 column is 457 variance of column is 2057.790399206165 column is 459 variance of column is 180.94193413361225 column is 460 variance of column is 17771.370771491715 column is 490 variance of column is 32554.438480399665 column is 493 variance of column is 94.85752718627867 column is 494 variance of column is 4376.08745667032 column is 495 variance of column is 1062.772675962555 column is 499 variance of column is 10534171.630078899 column is 500 variance of column is 10419795.7547498 column is 510 variance of column is 141885.2543270213 column is 511 variance of column is 10854000.355236158 column is 520 variance of column is 3251.6982451640606 column is 521 variance of column is 1063435.2249241408 column is 522 variance of column is 5047.299525766297 column is 523 variance of column is 1720.2425110957852 column is 524 variance of column is 42697.66858652176 column is 526 variance of column is 91.85833359574673 column is 527 variance of column is 356.7178609038991 column is 542 variance of column is 0.0007480286503535585 column is 543 variance of column is 0.00023487556420582746 column is 544 variance of column is 8.722062681829122e-06 column is 545 variance of column is 172.84464097411572 column is 558 variance of column is 0.7027931273844428 column is 559 variance of column is 4.053278570585999 column is 560 variance of column is 0.2658573299651426 column is 561 variance of column is 36176.06173953618 column is 570 variance of column is 30624.076069844796 column is 571 variance of column is 7.568652701397191 column is 572 variance of column is 744849.8015119963 column is 573 variance of column is 6.174137791144954 column is 574 variance of column is 72469.44964490252 column is 575 variance of column is 0.4595635745341555 column is 576 variance of column is 28633.271876188883 column is 577 variance of column is 15588.189176043297 column is 582 variance of column is 0.001158202874297205 column is 583 variance of column is 0.029496120465380995 column is 584 variance of column is 0.0013832871251155984 column is 585 variance of column is 1279.4146055530525 column is 586 variance of column is 0.015262698833693036 column is 587 variance of column is 0.007752314414791297 column is 588 variance of column is 0.0008214849936143055 column is 589 variance of column is 881006.3098428887 Skipping
variable_df
| column | variance | |
|---|---|---|
| 0 | 8 | 0.545373 |
| 1 | 9 | 0.022821 |
| 2 | 10 | 0.008642 |
| 3 | 11 | 0.015486 |
| 4 | 12 | 1059.629456 |
| ... | ... | ... |
| 231 | 585 | 1279.414606 |
| 232 | 586 | 0.015263 |
| 233 | 587 | 0.007752 |
| 234 | 588 | 0.000821 |
| 235 | 589 | 881006.309843 |
236 rows × 2 columns
variable_df.sort_values(by=['variance'], ascending=True)
| column | variance | |
|---|---|---|
| 214 | 544 | 8.722063e-06 |
| 151 | 377 | 2.173816e-05 |
| 150 | 376 | 2.851653e-05 |
| 45 | 104 | 7.227809e-05 |
| 98 | 239 | 2.074996e-04 |
| ... | ... | ... |
| 119 | 296 | 3.896083e+08 |
| 14 | 24 | 8.414849e+08 |
| 120 | 297 | 1.039974e+09 |
| 66 | 161 | 1.794825e+09 |
| 67 | 162 | 4.289442e+09 |
236 rows × 2 columns
variable_df[variable_df['variance'] < 0.02].count()
column 38 variance 38 dtype: int64
var_drop_cols
38
signal_df.shape #........ features set has been brought down to 199 after dropping 38 columns with less than 2% variance
(1567, 199)
signal_df.describe()
| 8 | 9 | 12 | 14 | 15 | 16 | 18 | 20 | 21 | 22 | ... | 572 | 573 | 574 | 575 | 576 | 577 | 583 | 585 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | ... | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 |
| mean | 1.462862 | -0.000841 | 199.956809 | 9.005371 | 413.086035 | 9.907603 | 190.047354 | 1.405054 | -5618.393610 | 2699.378435 | ... | 28.450165 | 0.345636 | 9.162315 | 0.104729 | 5.563747 | 16.642363 | 0.015318 | 3.067826 | 99.670066 | -0.867262 |
| std | 0.073849 | 0.015107 | 3.255195 | 2.793916 | 17.204591 | 2.401563 | 2.778376 | 0.016737 | 626.421781 | 295.309778 | ... | 86.304681 | 0.248478 | 26.920150 | 0.067791 | 16.921369 | 12.485267 | 0.017174 | 3.576891 | 93.861936 | 0.498010 |
| min | 1.191000 | -0.053400 | 182.094000 | 2.249300 | 333.448600 | 4.469600 | 169.177400 | 1.179700 | -7150.250000 | 0.000000 | ... | 3.540000 | 0.066700 | 1.039500 | 0.023000 | 0.663600 | 4.582000 | 0.006000 | 1.197500 | 0.000000 | -1.000000 |
| 25% | 1.411250 | -0.010800 | 198.130950 | 7.096750 | 406.131000 | 9.568550 | 188.300650 | 1.396500 | -5932.625000 | 2578.125000 | ... | 7.500000 | 0.242250 | 2.567850 | 0.075100 | 1.408450 | 11.501550 | 0.011600 | 2.306500 | 44.368600 | -1.000000 |
| 50% | 1.461600 | -0.001300 | 199.537100 | 8.973900 | 412.262900 | 9.852000 | 189.675700 | 1.406000 | -5523.250000 | 2664.000000 | ... | 8.650000 | 0.293400 | 2.975800 | 0.089500 | 1.624500 | 13.817900 | 0.013800 | 2.757700 | 72.023000 | -1.000000 |
| 75% | 1.516850 | 0.008400 | 202.006750 | 10.858700 | 419.082800 | 10.127750 | 192.178900 | 1.415000 | -5356.625000 | 2840.625000 | ... | 10.130000 | 0.366900 | 3.492500 | 0.112150 | 1.902000 | 17.080900 | 0.016500 | 3.294950 | 114.749700 | -1.000000 |
| max | 1.656400 | 0.074900 | 272.045100 | 19.546500 | 824.927100 | 102.867700 | 215.597700 | 1.453400 | 0.000000 | 3656.250000 | ... | 454.560000 | 2.196700 | 170.020400 | 0.550200 | 90.423500 | 96.960100 | 0.476600 | 99.303200 | 737.304800 | 1.000000 |
8 rows × 199 columns
before_bi_variate_signal_df = signal_df.copy()
before_bi_variate_signal_df.shape
(1567, 199)
#....... 2D......... Bivariate checking of all predictors with target (Pass/Fail) column
corr_drop_cols = 0
corr_df = pd.DataFrame(columns = ['column', 'correlation'])
ind = 0
for i in signal_df.columns:
if (i == 'Pass/Fail'):
print("Skipping")
else:
corr_val = signal_df['Pass/Fail'].corr(signal_df[i])
corr_val = round(corr_val,3)
print("column is",i)
print("correlation of column is",corr_val)
tempResultsDf = pd.DataFrame({'column':i,'correlation': corr_val}, index=[ind])
ind = ind+1
corr_df = pd.concat([corr_df, tempResultsDf])
corr_df = corr_df[['column','correlation']]
column is 8 correlation of column is 0.028 column is 9 correlation of column is -0.031 column is 12 correlation of column is -0.006 column is 14 correlation of column is -0.069 column is 15 correlation of column is -0.003 column is 16 correlation of column is 0.002 column is 18 correlation of column is -0.009 column is 20 correlation of column is 0.023 column is 21 correlation of column is 0.108 column is 22 correlation of column is -0.073 column is 23 correlation of column is 0.013 column is 24 correlation of column is -0.018 column is 25 correlation of column is -0.036 column is 26 correlation of column is -0.081 column is 27 correlation of column is -0.029 column is 28 correlation of column is -0.107 column is 29 correlation of column is 0.017 column is 30 correlation of column is 0.037 column is 31 correlation of column is -0.049 column is 32 correlation of column is 0.052 column is 33 correlation of column is 0.081 column is 34 correlation of column is 0.003 column is 35 correlation of column is -0.012 column is 36 correlation of column is -0.003 column is 37 correlation of column is 0.041 column is 38 correlation of column is 0.053 column is 39 correlation of column is -0.002 column is 43 correlation of column is -0.02 column is 44 correlation of column is -0.023 column is 45 correlation of column is 0.008 column is 46 correlation of column is -0.009 column is 47 correlation of column is 0.028 column is 48 correlation of column is -0.015 column is 50 correlation of column is -0.015 column is 51 correlation of column is 0.006 column is 83 correlation of column is -0.024 column is 86 correlation of column is 0.025 column is 88 correlation of column is 0.027 column is 115 correlation of column is -0.044 column is 117 correlation of column is -0.017 column is 120 correlation of column is -0.02 column is 144 correlation of column is -0.038 column is 145 correlation of column is -0.044 column is 146 correlation of column is -0.044 column is 147 correlation of column is 0.008 column is 148 correlation of column is -0.006 column is 150 correlation of column is -0.012 column is 151 correlation of column is -0.022 column is 152 correlation of column is -0.005 column is 154 correlation of column is 0.002 column is 156 correlation of column is 0.002 column is 159 correlation of column is 0.078 column is 160 correlation of column is 0.09 column is 161 correlation of column is -0.02 column is 162 correlation of column is -0.006 column is 163 correlation of column is 0.1 column is 164 correlation of column is 0.088 column is 165 correlation of column is 0.084 column is 166 correlation of column is 0.052 column is 167 correlation of column is 0.022 column is 168 correlation of column is 0.007 column is 169 correlation of column is -0.011 column is 170 correlation of column is 0.037 column is 171 correlation of column is 0.003 column is 172 correlation of column is 0.037 column is 173 correlation of column is -0.007 column is 174 correlation of column is 0.037 column is 175 correlation of column is 0.043 column is 176 correlation of column is -0.014 column is 177 correlation of column is -0.004 column is 180 correlation of column is -0.079 column is 181 correlation of column is -0.043 column is 182 correlation of column is -0.031 column is 183 correlation of column is 0.056 column is 184 correlation of column is 0.003 column is 185 correlation of column is -0.009 column is 187 correlation of column is 0.006 column is 188 correlation of column is 0.047 column is 218 correlation of column is -0.015 column is 221 correlation of column is 0.022 column is 222 correlation of column is 0.031 column is 223 correlation of column is -0.001 column is 248 correlation of column is 0.009 column is 249 correlation of column is 0.066 column is 250 correlation of column is 0.007 column is 251 correlation of column is -0.01 column is 252 correlation of column is -0.007 column is 255 correlation of column is -0.012 column is 279 correlation of column is -0.038 column is 283 correlation of column is -0.007 column is 285 correlation of column is -0.012 column is 286 correlation of column is -0.021 column is 287 correlation of column is -0.005 column is 289 correlation of column is -0.003 column is 291 correlation of column is 0.0 column is 294 correlation of column is 0.082 column is 295 correlation of column is 0.092 column is 296 correlation of column is -0.016 column is 297 correlation of column is -0.004 column is 298 correlation of column is 0.102 column is 299 correlation of column is 0.088 column is 300 correlation of column is 0.084 column is 301 correlation of column is 0.048 column is 302 correlation of column is 0.025 column is 303 correlation of column is 0.008 column is 304 correlation of column is -0.008 column is 305 correlation of column is 0.021 column is 306 correlation of column is 0.003 column is 307 correlation of column is 0.022 column is 308 correlation of column is -0.002 column is 309 correlation of column is 0.022 column is 310 correlation of column is 0.039 column is 311 correlation of column is -0.019 column is 312 correlation of column is -0.003 column is 316 correlation of column is -0.089 column is 317 correlation of column is -0.04 column is 318 correlation of column is -0.036 column is 319 correlation of column is 0.053 column is 320 correlation of column is 0.003 column is 321 correlation of column is -0.01 column is 323 correlation of column is 0.006 column is 324 correlation of column is 0.04 column is 356 correlation of column is -0.01 column is 360 correlation of column is 0.039 column is 361 correlation of column is -0.005 column is 386 correlation of column is 0.008 column is 388 correlation of column is 0.02 column is 390 correlation of column is -0.007 column is 393 correlation of column is -0.003 column is 417 correlation of column is -0.043 column is 418 correlation of column is -0.03 column is 419 correlation of column is 0.014 column is 420 correlation of column is 0.007 column is 421 correlation of column is -0.005 column is 423 correlation of column is 0.048 column is 424 correlation of column is -0.023 column is 425 correlation of column is 0.014 column is 426 correlation of column is 0.021 column is 427 correlation of column is 0.001 column is 429 correlation of column is 0.0 column is 430 correlation of column is 0.11 column is 431 correlation of column is 0.12 column is 432 correlation of column is 0.004 column is 433 correlation of column is 0.048 column is 434 correlation of column is 0.112 column is 435 correlation of column is 0.109 column is 436 correlation of column is 0.106 column is 437 correlation of column is 0.07 column is 438 correlation of column is 0.016 column is 439 correlation of column is -0.02 column is 440 correlation of column is -0.007 column is 441 correlation of column is 0.033 column is 442 correlation of column is -0.013 column is 443 correlation of column is 0.037 column is 444 correlation of column is -0.007 column is 445 correlation of column is 0.038 column is 446 correlation of column is 0.042 column is 447 correlation of column is -0.015 column is 448 correlation of column is -0.004 column is 452 correlation of column is -0.077 column is 453 correlation of column is -0.042 column is 454 correlation of column is -0.033 column is 455 correlation of column is 0.056 column is 456 correlation of column is 0.001 column is 457 correlation of column is -0.008 column is 459 correlation of column is 0.006 column is 460 correlation of column is 0.061 column is 490 correlation of column is -0.007 column is 493 correlation of column is 0.019 column is 494 correlation of column is 0.035 column is 495 correlation of column is -0.004 column is 499 correlation of column is -0.03 column is 500 correlation of column is 0.015 column is 510 correlation of column is 0.132 column is 511 correlation of column is 0.055 column is 520 correlation of column is 0.008 column is 521 correlation of column is 0.037 column is 522 correlation of column is 0.014 column is 523 correlation of column is -0.01 column is 524 correlation of column is -0.006 column is 526 correlation of column is -0.022 column is 527 correlation of column is -0.01 column is 545 correlation of column is -0.046 column is 558 correlation of column is 0.023 column is 559 correlation of column is 0.024 column is 560 correlation of column is 0.02 column is 561 correlation of column is 0.026 column is 570 correlation of column is -0.002 column is 571 correlation of column is -0.019 column is 572 correlation of column is -0.032 column is 573 correlation of column is -0.052 column is 574 correlation of column is -0.035 column is 575 correlation of column is -0.053 column is 576 correlation of column is -0.028 column is 577 correlation of column is -0.05 column is 583 correlation of column is 0.006 column is 585 correlation of column is 0.005 column is 589 correlation of column is -0.003 Skipping
corr_df.shape
(198, 2)
negative_corr_df = corr_df[corr_df['correlation'] <= 0]
negative_corr_df.shape
(99, 2)
negative_corr_df.sort_values(by=['correlation'], ascending=True).head(30)
#..... the negative corelations do not seem to be strong...........
#..... strongest possible negatively corelated column seems to be column 28 with a corelation of -0.107
#..... the remaining 99 columns which are negatively corelated seems to be going much more closer to 0
| column | correlation | |
|---|---|---|
| 15 | 28 | -0.107 |
| 114 | 316 | -0.089 |
| 13 | 26 | -0.081 |
| 70 | 180 | -0.079 |
| 159 | 452 | -0.077 |
| 9 | 22 | -0.073 |
| 3 | 14 | -0.069 |
| 192 | 575 | -0.053 |
| 190 | 573 | -0.052 |
| 194 | 577 | -0.050 |
| 18 | 31 | -0.049 |
| 182 | 545 | -0.046 |
| 43 | 146 | -0.044 |
| 42 | 145 | -0.044 |
| 38 | 115 | -0.044 |
| 129 | 417 | -0.043 |
| 71 | 181 | -0.043 |
| 160 | 453 | -0.042 |
| 115 | 317 | -0.040 |
| 88 | 279 | -0.038 |
| 41 | 144 | -0.038 |
| 12 | 25 | -0.036 |
| 116 | 318 | -0.036 |
| 191 | 574 | -0.035 |
| 161 | 454 | -0.033 |
| 189 | 572 | -0.032 |
| 72 | 182 | -0.031 |
| 1 | 9 | -0.031 |
| 130 | 418 | -0.030 |
| 171 | 499 | -0.030 |
negative_corr_df.sort_values(by=['correlation'], ascending=False).head(50)
#........ these are the worst possible negatively correlated features and can be dropped as they do not
#........ contribute in any way to the Target dependent variable "Pass/Fail"
| column | correlation | |
|---|---|---|
| 139 | 429 | 0.000 |
| 94 | 291 | 0.000 |
| 81 | 223 | -0.001 |
| 26 | 39 | -0.002 |
| 187 | 570 | -0.002 |
| 109 | 308 | -0.002 |
| 197 | 589 | -0.003 |
| 128 | 393 | -0.003 |
| 23 | 36 | -0.003 |
| 93 | 289 | -0.003 |
| 113 | 312 | -0.003 |
| 4 | 15 | -0.003 |
| 170 | 495 | -0.004 |
| 158 | 448 | -0.004 |
| 98 | 297 | -0.004 |
| 69 | 177 | -0.004 |
| 92 | 287 | -0.005 |
| 133 | 421 | -0.005 |
| 124 | 361 | -0.005 |
| 48 | 152 | -0.005 |
| 45 | 148 | -0.006 |
| 179 | 524 | -0.006 |
| 2 | 12 | -0.006 |
| 54 | 162 | -0.006 |
| 65 | 173 | -0.007 |
| 154 | 444 | -0.007 |
| 86 | 252 | -0.007 |
| 150 | 440 | -0.007 |
| 127 | 390 | -0.007 |
| 167 | 490 | -0.007 |
| 89 | 283 | -0.007 |
| 164 | 457 | -0.008 |
| 105 | 304 | -0.008 |
| 30 | 46 | -0.009 |
| 75 | 185 | -0.009 |
| 6 | 18 | -0.009 |
| 85 | 251 | -0.010 |
| 181 | 527 | -0.010 |
| 178 | 523 | -0.010 |
| 122 | 356 | -0.010 |
| 119 | 321 | -0.010 |
| 61 | 169 | -0.011 |
| 22 | 35 | -0.012 |
| 46 | 150 | -0.012 |
| 87 | 255 | -0.012 |
| 90 | 285 | -0.012 |
| 152 | 442 | -0.013 |
| 68 | 176 | -0.014 |
| 78 | 218 | -0.015 |
| 157 | 447 | -0.015 |
worst_50_negative_df = negative_corr_df.sort_values(by=['correlation'], ascending=False).head(50)
worst_50_negative_df["column"]
139 429 94 291 81 223 26 39 187 570 109 308 197 589 128 393 23 36 93 289 113 312 4 15 170 495 158 448 98 297 69 177 92 287 133 421 124 361 48 152 45 148 179 524 2 12 54 162 65 173 154 444 86 252 150 440 127 390 167 490 89 283 164 457 105 304 30 46 75 185 6 18 85 251 181 527 178 523 122 356 119 321 61 169 22 35 46 150 87 255 90 285 152 442 68 176 78 218 157 447 Name: column, dtype: object
signal_df.drop(worst_50_negative_df["column"],axis=1,inplace=True)
#.... dropping the worst negatively corelated 50 columns
signal_df.shape #............. with the dropping of 50 columns the independent variable has reduced to 149
(1567, 149)
positive_corr_df = corr_df[corr_df['correlation'] > 0]
positive_corr_df.shape
(99, 2)
positive_corr_df.sort_values(by=['correlation'], ascending=False).head(10)
#....... like the Negative corelation even the positive corelations also does not look any great
#....... with the best possible positively correlated column 510 having a corelation of 0.132
| column | correlation | |
|---|---|---|
| 173 | 510 | 0.132 |
| 141 | 431 | 0.120 |
| 144 | 434 | 0.112 |
| 140 | 430 | 0.110 |
| 145 | 435 | 0.109 |
| 8 | 21 | 0.108 |
| 146 | 436 | 0.106 |
| 99 | 298 | 0.102 |
| 55 | 163 | 0.100 |
| 96 | 295 | 0.092 |
positive_corr_df.sort_values(by=['correlation'], ascending=True).head(50)
#......... the worst positively corelated independent features with the dependent variable Pass/Fail...
#......... these can also be dropped from signal_df as they do not contribute anything at all to the dependent variable
| column | correlation | |
|---|---|---|
| 163 | 456 | 0.001 |
| 138 | 427 | 0.001 |
| 5 | 16 | 0.002 |
| 50 | 156 | 0.002 |
| 49 | 154 | 0.002 |
| 74 | 184 | 0.003 |
| 21 | 34 | 0.003 |
| 63 | 171 | 0.003 |
| 118 | 320 | 0.003 |
| 107 | 306 | 0.003 |
| 142 | 432 | 0.004 |
| 196 | 585 | 0.005 |
| 76 | 187 | 0.006 |
| 34 | 51 | 0.006 |
| 120 | 323 | 0.006 |
| 165 | 459 | 0.006 |
| 195 | 583 | 0.006 |
| 84 | 250 | 0.007 |
| 132 | 420 | 0.007 |
| 60 | 168 | 0.007 |
| 125 | 386 | 0.008 |
| 175 | 520 | 0.008 |
| 104 | 303 | 0.008 |
| 29 | 45 | 0.008 |
| 44 | 147 | 0.008 |
| 82 | 248 | 0.009 |
| 10 | 23 | 0.013 |
| 136 | 425 | 0.014 |
| 131 | 419 | 0.014 |
| 177 | 522 | 0.014 |
| 172 | 500 | 0.015 |
| 148 | 438 | 0.016 |
| 16 | 29 | 0.017 |
| 168 | 493 | 0.019 |
| 185 | 560 | 0.020 |
| 126 | 388 | 0.020 |
| 137 | 426 | 0.021 |
| 106 | 305 | 0.021 |
| 59 | 167 | 0.022 |
| 79 | 221 | 0.022 |
| 110 | 309 | 0.022 |
| 108 | 307 | 0.022 |
| 183 | 558 | 0.023 |
| 7 | 20 | 0.023 |
| 184 | 559 | 0.024 |
| 103 | 302 | 0.025 |
| 36 | 86 | 0.025 |
| 186 | 561 | 0.026 |
| 37 | 88 | 0.027 |
| 0 | 8 | 0.028 |
worst_50_positive_df = positive_corr_df.sort_values(by=['correlation'], ascending=True).head(50)
worst_50_positive_df
| column | correlation | |
|---|---|---|
| 163 | 456 | 0.001 |
| 138 | 427 | 0.001 |
| 5 | 16 | 0.002 |
| 50 | 156 | 0.002 |
| 49 | 154 | 0.002 |
| 74 | 184 | 0.003 |
| 21 | 34 | 0.003 |
| 63 | 171 | 0.003 |
| 118 | 320 | 0.003 |
| 107 | 306 | 0.003 |
| 142 | 432 | 0.004 |
| 196 | 585 | 0.005 |
| 76 | 187 | 0.006 |
| 34 | 51 | 0.006 |
| 120 | 323 | 0.006 |
| 165 | 459 | 0.006 |
| 195 | 583 | 0.006 |
| 84 | 250 | 0.007 |
| 132 | 420 | 0.007 |
| 60 | 168 | 0.007 |
| 125 | 386 | 0.008 |
| 175 | 520 | 0.008 |
| 104 | 303 | 0.008 |
| 29 | 45 | 0.008 |
| 44 | 147 | 0.008 |
| 82 | 248 | 0.009 |
| 10 | 23 | 0.013 |
| 136 | 425 | 0.014 |
| 131 | 419 | 0.014 |
| 177 | 522 | 0.014 |
| 172 | 500 | 0.015 |
| 148 | 438 | 0.016 |
| 16 | 29 | 0.017 |
| 168 | 493 | 0.019 |
| 185 | 560 | 0.020 |
| 126 | 388 | 0.020 |
| 137 | 426 | 0.021 |
| 106 | 305 | 0.021 |
| 59 | 167 | 0.022 |
| 79 | 221 | 0.022 |
| 110 | 309 | 0.022 |
| 108 | 307 | 0.022 |
| 183 | 558 | 0.023 |
| 7 | 20 | 0.023 |
| 184 | 559 | 0.024 |
| 103 | 302 | 0.025 |
| 36 | 86 | 0.025 |
| 186 | 561 | 0.026 |
| 37 | 88 | 0.027 |
| 0 | 8 | 0.028 |
signal_df.drop(worst_50_positive_df["column"],axis=1,inplace=True)
#.......... dropping the worst 50 positively corelated columns
signal_df.shape #......... after dropping the worst 50 positive correlation columns, the overall features have reduced to 99
(1567, 99)
before_vif_drop_df = signal_df.copy()
before_vif_drop_df.shape
(1567, 99)
X = signal_df.drop("Pass/Fail",axis=1)
X.shape
(1567, 98)
# 2E....
# Checking the Variance inflation factor and deleting the the column with Highest VIF and looping through 80 times
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
end = len(X.columns) * 0.80
end = int(end)
for knt in range(1,end):
print("knt is",knt)
vif["variables"] = X.columns
vif["VIF"] = [variance_inflation_factor(signal_df.values, i) for i in range(X.shape[1])]
max_vif = vif[vif.loc[:,'VIF'] == vif['VIF'].max()]
index_names = vif[ vif['VIF'] == vif['VIF' ].max()].index
print("max variable",max_vif['variables'])
print("index_names",index_names)
X.drop(max_vif['variables'],axis=1,inplace=True)
vif = vif.drop(index_names,inplace=True)
vif = pd.DataFrame()
knt is 1 max variable 37 174 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 2 max variable 37 175 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 3 max variable 37 180 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 4 max variable 37 181 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 5 max variable 37 182 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 6 max variable 37 183 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 7 max variable 37 188 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 8 max variable 37 222 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 9 max variable 37 249 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 10 max variable 37 279 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 11 max variable 37 286 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 12 max variable 37 294 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 13 max variable 37 295 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 14 max variable 37 296 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 15 max variable 37 298 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 16 max variable 37 299 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 17 max variable 37 300 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 18 max variable 37 301 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 19 max variable 37 310 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 20 max variable 37 311 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 21 max variable 37 316 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 22 max variable 37 317 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 23 max variable 37 318 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 24 max variable 37 319 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 25 max variable 37 324 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 26 max variable 37 360 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 27 max variable 37 417 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 28 max variable 37 418 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 29 max variable 37 423 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 30 max variable 37 424 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 31 max variable 37 430 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 32 max variable 37 431 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 33 max variable 37 433 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 34 max variable 37 434 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 35 max variable 37 435 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 36 max variable 37 436 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 37 max variable 37 437 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 38 max variable 37 439 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 39 max variable 37 441 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 40 max variable 37 443 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 41 max variable 37 445 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 42 max variable 37 446 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 43 max variable 37 452 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 44 max variable 37 453 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 45 max variable 37 454 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 46 max variable 37 455 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 47 max variable 37 460 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 48 max variable 37 494 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 49 max variable 37 499 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 50 max variable 37 510 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 51 max variable 37 511 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 52 max variable 37 521 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 53 max variable 37 526 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 54 max variable 37 545 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 55 max variable 37 571 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 56 max variable 37 572 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 57 max variable 37 573 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 58 max variable 37 574 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 59 max variable 37 575 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 60 max variable 37 576 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 61 max variable 37 577 Name: variables, dtype: object index_names Int64Index([37], dtype='int64') knt is 62 max variable 36 172 Name: variables, dtype: object index_names Int64Index([36], dtype='int64') knt is 63 max variable 13 37 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 64 max variable 13 38 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 65 max variable 13 43 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 66 max variable 13 44 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 67 max variable 13 47 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 68 max variable 13 48 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 69 max variable 13 50 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 70 max variable 13 83 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 71 max variable 13 115 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 72 max variable 13 117 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 73 max variable 13 120 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 74 max variable 13 144 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 75 max variable 13 145 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 76 max variable 13 146 Name: variables, dtype: object index_names Int64Index([13], dtype='int64') knt is 77 max variable 13 151 Name: variables, dtype: object index_names Int64Index([13], dtype='int64')
X.shape # this identified 77 VIF high columns and dropped them from the dataset reducing the Independent variables to 21
(1567, 21)
X
| 9 | 14 | 21 | 22 | 24 | 25 | 26 | 27 | 28 | 30 | ... | 32 | 33 | 159 | 160 | 161 | 163 | 164 | 165 | 166 | 170 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.016200 | 7.955800 | -5419.00 | 2916.50 | 751.00 | 0.8955 | 1.7730 | 3.0490 | 64.2333 | 0.1632 | ... | 83.3971 | 9.5126 | 1017.0 | 967.0 | 1066.0 | 0.090 | 0.048 | 0.095 | 2.0 | 0.7250 |
| 1 | -0.000500 | 10.154800 | -5441.50 | 2604.25 | -1640.25 | 1.2973 | 2.0143 | 7.3900 | 68.4222 | 0.2102 | ... | 84.9052 | 9.7997 | 568.0 | 59.0 | 297.0 | 0.112 | 0.115 | 0.124 | 2.2 | 1.0498 |
| 2 | 0.004100 | 9.515700 | -5447.75 | 2701.75 | -1916.50 | 1.3122 | 2.0295 | 7.5788 | 67.1333 | 0.1734 | ... | 84.7569 | 8.6590 | 562.0 | 788.0 | 759.0 | 0.187 | 0.117 | 0.068 | 2.1 | 1.0824 |
| 3 | -0.012400 | 9.605200 | -5468.25 | 2648.25 | -1657.25 | 1.3137 | 2.0038 | 7.3145 | 62.9333 | 0.2071 | ... | 84.9105 | 8.6789 | 859.0 | 355.0 | 3433.0 | 0.068 | 0.108 | 0.100 | 1.7 | 0.9386 |
| 4 | -0.003100 | 10.566100 | -5476.25 | 2635.25 | 117.00 | 1.2887 | 1.9912 | 7.2748 | 62.8333 | 0.2696 | ... | 86.3269 | 8.7677 | 699.0 | 283.0 | 1747.0 | 0.147 | 0.040 | 0.113 | 3.9 | 0.5760 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | -0.004500 | 11.769200 | -5418.75 | 2608.00 | 356.00 | 1.2817 | 1.9540 | 7.0793 | 71.1444 | 0.1753 | ... | 83.8405 | 8.7164 | 1280.0 | 334.0 | 7112.0 | 0.123 | 0.159 | 0.260 | 5.9 | 0.7184 |
| 1563 | -0.006100 | 9.162000 | -6408.75 | 2277.50 | 339.00 | 1.0870 | 1.8023 | 5.1515 | 72.8444 | 0.1416 | ... | 84.0623 | 8.9607 | 504.0 | 94.0 | 315.0 | 0.055 | 0.043 | 0.268 | 2.7 | 0.6214 |
| 1564 | -0.000841 | 9.005371 | -5153.25 | 2707.00 | -1226.00 | 1.2930 | 1.9435 | 7.2315 | 71.2667 | 0.1659 | ... | 85.8638 | 8.1728 | 1178.0 | 542.0 | 3662.0 | 0.109 | 0.074 | 0.273 | 3.2 | 0.6512 |
| 1565 | -0.007200 | 9.735400 | -5271.75 | 2676.50 | 394.75 | 1.2875 | 1.9880 | 7.3255 | 70.5111 | 0.2386 | ... | 84.5602 | 9.1930 | 1740.0 | 252.0 | 2702.0 | 0.098 | 0.193 | 0.250 | 2.2 | 0.3993 |
| 1566 | -0.000841 | 9.005371 | -5319.50 | 2668.00 | -425.00 | 1.3020 | 2.0085 | 7.3395 | 73.0667 | 0.2021 | ... | 83.3424 | 8.7786 | 763.0 | 304.0 | 503.0 | 0.051 | 0.050 | 0.308 | 3.9 | 0.6821 |
1567 rows × 21 columns
X.describe()
| 9 | 14 | 21 | 22 | 24 | 25 | 26 | 27 | 28 | 30 | ... | 32 | 33 | 159 | 160 | 161 | 163 | 164 | 165 | 166 | 170 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | ... | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 |
| mean | -0.000841 | 9.005371 | -5618.393610 | 2699.378435 | -298.598136 | 1.203845 | 1.938477 | 6.638628 | 69.499532 | 0.184159 | ... | 85.337469 | 8.960279 | 882.680511 | 555.346326 | 4066.850479 | 0.140204 | 0.127942 | 0.252026 | 2.788882 | 0.684330 |
| std | 0.015107 | 2.793916 | 626.421781 | 295.309778 | 2900.835956 | 0.177487 | 0.189374 | 1.243454 | 3.458970 | 0.032923 | ... | 2.025902 | 1.344027 | 982.415079 | 574.441416 | 4236.537140 | 0.121911 | 0.242379 | 0.407069 | 1.119041 | 0.157418 |
| min | -0.053400 | 2.249300 | -7150.250000 | 0.000000 | -14804.500000 | 0.000000 | 0.000000 | 0.000000 | 59.400000 | 0.034100 | ... | 83.182900 | 7.603200 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.800000 | 0.297900 |
| 25% | -0.010800 | 7.096750 | -5932.625000 | 2578.125000 | -1474.375000 | 1.094900 | 1.906750 | 5.267350 | 67.383350 | 0.161800 | ... | 84.490500 | 8.580000 | 411.500000 | 295.000000 | 1322.500000 | 0.091000 | 0.068500 | 0.132000 | 2.100000 | 0.575600 |
| 50% | -0.001300 | 8.973900 | -5523.250000 | 2664.000000 | -80.500000 | 1.283000 | 1.986300 | 7.264500 | 69.155600 | 0.186700 | ... | 85.140400 | 8.770000 | 623.000000 | 438.000000 | 2620.000000 | 0.120000 | 0.089000 | 0.184000 | 2.600000 | 0.686000 |
| 75% | 0.008400 | 10.858700 | -5356.625000 | 2840.625000 | 1376.250000 | 1.304300 | 2.003200 | 7.329600 | 72.255550 | 0.207000 | ... | 85.741900 | 9.060600 | 963.500000 | 624.500000 | 5033.000000 | 0.154000 | 0.116500 | 0.254500 | 3.200000 | 0.797300 |
| max | 0.074900 | 19.546500 | 0.000000 | 3656.250000 | 14106.000000 | 1.382800 | 2.052800 | 7.658800 | 77.900000 | 0.285100 | ... | 105.603800 | 23.345300 | 7791.000000 | 4170.000000 | 37943.000000 | 0.957000 | 1.817000 | 3.286000 | 21.100000 | 1.153000 |
8 rows × 21 columns
signal_df.shape
(1567, 99)
backup_X = X.copy()
backup_X.shape
(1567, 21)
signal_new = X.copy()
signal_new.shape # With this we have successfully brought down the dataset from a high of 592 to 21
(1567, 21)
signal_new['Pass/Fail'] = signal_df['Pass/Fail'].values #........... adding the Target variable Pass/Fail to signal_new
signal_new['Pass/Fail'].value_counts()
-1 1463 1 104 Name: Pass/Fail, dtype: int64
signal_df['Pass/Fail'].value_counts()
-1 1463 1 104 Name: Pass/Fail, dtype: int64
# 3A.... Univariate Analysis
for i, col in enumerate(signal_new.columns):
fig = plt.figure(figsize = (10,5))
plt.figure(i)
ax = fig.gca()
fig.suptitle(col, fontsize=20)
sns.histplot(signal_new[col], kde=True, ax=ax)
<ipython-input-71-e513aad3098c>:2: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize = (10,5))
<Figure size 432x288 with 0 Axes>
Observations:
1. Column 9, 14, 30, 170 seem to be normally distributed
2. Column 32, 33, 160 is normally distributed with a huge right skew
3. Column 31, 21, 22, 25, seem to have 2 clusters
4. Column 27 seems to have 2 clusters and is left skewed
5. Column 26 seems to have 3 clusters
6. Column 161 is right skewed
7. Colulmn 163, 164, 165, 166 have 2 clusters with 1 normally distrubuted and the other very small with a right skew
for i in signal_new.columns:
print("column is",i)
fig = plt.figure(figsize = (6,4))
plt.figure(i)
fig.suptitle(i, fontsize=20)
signal_new[i].plot.box()
plt.show()
column is 9
<Figure size 432x288 with 0 Axes>
column is 14
<Figure size 432x288 with 0 Axes>
column is 21
<Figure size 432x288 with 0 Axes>
column is 22
<Figure size 432x288 with 0 Axes>
column is 24
<Figure size 432x288 with 0 Axes>
column is 25
<Figure size 432x288 with 0 Axes>
column is 26
<Figure size 432x288 with 0 Axes>
column is 27
<Figure size 432x288 with 0 Axes>
column is 28
<Figure size 432x288 with 0 Axes>
column is 30
<Figure size 432x288 with 0 Axes>
column is 31
<Figure size 432x288 with 0 Axes>
column is 32
<Figure size 432x288 with 0 Axes>
column is 33
<Figure size 432x288 with 0 Axes>
column is 159
<Figure size 432x288 with 0 Axes>
column is 160
<Figure size 432x288 with 0 Axes>
column is 161
<Figure size 432x288 with 0 Axes>
column is 163
<Figure size 432x288 with 0 Axes>
column is 164
<Figure size 432x288 with 0 Axes>
column is 165
<Figure size 432x288 with 0 Axes>
column is 166
<Figure size 432x288 with 0 Axes>
column is 170
<Figure size 432x288 with 0 Axes>
column is Pass/Fail
<Figure size 432x288 with 0 Axes>
# Observations:
# 1. There are too many outliers in all the columns
# 2. Column 21, 22, 24 seems to have huge outliers on both sides
# 3. Column 14, 32, 33 has outliers only above Maximum
# 4. Column 25, 26, 27 has outliers only below minimum
# 5. Column 28, 30 seems to be ok with just very few outliers below minimum
# 6. Column 31 is equally distributed with outliers above and below maximum and minimum
# 7. Column 159, 160, 161 has too many outliers above maximum
# 8. Column 163, 164, 165, 166 has too many outliers above maximum with clearly 2 bunches with a significant gap
# 9. Column 170 seems to have only 1 outlier with perfect data distribution
signal_new.shape
(1567, 22)
for i in signal_new.columns:
if (i == 'Pass/Fail'):
print("")
else:
Q1 = np.percentile(signal_new[i], 25,interpolation = 'midpoint')
Q3 = np.percentile(signal_new[i], 75,interpolation = 'midpoint')
IQR = Q3 - Q1
# Upper bound
#upper = np.where(signal_new[i] >= (Q3+1.5*IQR))
upper = (Q3+1.5*IQR)
# Lower bound
#lower = np.where(signal_new[i] <= (Q1-1.5*IQR))
lower = (Q1-1.5*IQR)
curr_val = signal_new[i]
median = signal_new[i].quantile(0.50)
print("column is",i)
print("Median is",median)
print("Upper bound is",upper)
print("Lower bound is",lower)
signal_new[i] = np.where(signal_new[i] >= upper, median, curr_val)
signal_new[i] = np.where(signal_new[i] <= lower, median, curr_val)
column is 9 Median is -0.0013 Upper bound is 0.037200000000000004 Lower bound is -0.0396 column is 14 Median is 8.9739 Upper bound is 16.501624999999997 Lower bound is 1.453825000000002 column is 21 Median is -5523.25 Upper bound is -4492.625 Lower bound is -6796.625 column is 22 Median is 2664.0 Upper bound is 3234.375 Lower bound is 2184.375 column is 24 Median is -80.5 Upper bound is 5652.1875 Lower bound is -5750.3125 column is 25 Median is 1.283 Upper bound is 1.6184 Lower bound is 0.7807999999999999 column is 26 Median is 1.9863 Upper bound is 2.147875 Lower bound is 1.7620750000000003 column is 27 Median is 7.2645 Upper bound is 10.422975 Lower bound is 2.173975000000001 column is 28 Median is 69.1556 Upper bound is 79.56384999999999 Lower bound is 60.07505000000002 column is 30 Median is 0.1867 Upper bound is 0.27480000000000004 Lower bound is 0.09399999999999997 column is 31 Median is 3.4314 Upper bound is 3.7897000000000007 Lower bound is 3.1064999999999996 column is 32 Median is 85.1404 Upper bound is 87.619 Lower bound is 82.61339999999998 column is 33 Median is 8.77 Upper bound is 9.781500000000001 Lower bound is 7.859099999999999 column is 159 Median is 623.0 Upper bound is 1791.5 Lower bound is -416.5 column is 160 Median is 438.0 Upper bound is 1118.75 Lower bound is -199.25 column is 161 Median is 2620.0 Upper bound is 10598.75 Lower bound is -4243.25 column is 163 Median is 0.12 Upper bound is 0.2485 Lower bound is -0.003500000000000003 column is 164 Median is 0.089 Upper bound is 0.18849999999999972 Lower bound is -0.0034999999999998366 column is 165 Median is 0.184 Upper bound is 0.43825000000000003 Lower bound is -0.05174999999999999 column is 166 Median is 2.6 Upper bound is 4.8500000000000005 Lower bound is 0.44999999999999996 column is 170 Median is 0.6859999999999999 Upper bound is 1.12985 Lower bound is 0.24305
for i in signal_new.columns:
print("column is",i)
fig = plt.figure(figsize = (6,4))
plt.figure(i)
fig.suptitle(i, fontsize=20)
signal_new[i].plot.box()
plt.show()
column is 9
<Figure size 432x288 with 0 Axes>
column is 14
<Figure size 432x288 with 0 Axes>
column is 21
<Figure size 432x288 with 0 Axes>
column is 22
<Figure size 432x288 with 0 Axes>
column is 24
<Figure size 432x288 with 0 Axes>
column is 25
<Figure size 432x288 with 0 Axes>
column is 26
<Figure size 432x288 with 0 Axes>
column is 27
<Figure size 432x288 with 0 Axes>
column is 28
<Figure size 432x288 with 0 Axes>
column is 30
<Figure size 432x288 with 0 Axes>
column is 31
<Figure size 432x288 with 0 Axes>
column is 32
<Figure size 432x288 with 0 Axes>
column is 33
<Figure size 432x288 with 0 Axes>
column is 159
<Figure size 432x288 with 0 Axes>
column is 160
<Figure size 432x288 with 0 Axes>
column is 161
<Figure size 432x288 with 0 Axes>
column is 163
<Figure size 432x288 with 0 Axes>
column is 164
<Figure size 432x288 with 0 Axes>
column is 165
<Figure size 432x288 with 0 Axes>
column is 166
<Figure size 432x288 with 0 Axes>
column is 170
<Figure size 432x288 with 0 Axes>
column is Pass/Fail
<Figure size 432x288 with 0 Axes>
# Most of the outliers have been taken care but for some of the residual outliers
#..... 3B..... Bivariate analysis.... Checking each independent variable with dependent variable ("Pass/Fail")
for i in signal_new.columns:
if i == 'Pass/Fail':
print("Nothing",i)
else:
print("column is",i)
plt.figure()
#sns.jointplot(x=i, y='Pass/Fail', data=signal_new, hue='Pass/Fail');
g=sns.scatterplot(x=i, y="Pass/Fail",hue="Pass/Fail",data=signal_new,palette=['green','orange'], legend='full')
plt.show()
column is 9
column is 14
column is 21
column is 22
column is 24
column is 25
column is 26
column is 27
column is 28
column is 30
column is 31
column is 32
column is 33
column is 159
column is 160
column is 161
column is 163
column is 164
column is 165
column is 166
column is 170
Nothing Pass/Fail
Observations:
1. There doesnt seem to be any significant linear relationship between the independent and dependent variables for most
of the columns
#.... 3B..... Multivariate analysis
sns.set(rc={'figure.figsize':(18.7,1200.27)})
sns. set(style="ticks", color_codes=True)
sns.pairplot(signal_new);